{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "72505747",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pyspark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "bd55afbe",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'/home/alexey/spark/spark-3.0.3-bin-hadoop3.2/python/pyspark/__init__.py'"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pyspark.__file__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "29f1cf4c",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql import SparkSession"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "cf6d80ad",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING: An illegal reflective access operation has occurred\n",
      "WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/home/alexey/spark/spark-3.0.3-bin-hadoop3.2/jars/spark-unsafe_2.12-3.0.3.jar) to constructor java.nio.DirectByteBuffer(long,int)\n",
      "WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n",
      "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n",
      "WARNING: All illegal access operations will be denied in a future release\n",
      "22/02/15 22:22:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
      "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n",
      "Setting default log level to \"WARN\".\n",
      "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n"
     ]
    }
   ],
   "source": [
    "spark = SparkSession.builder \\\n",
    "    .master(\"local[*]\") \\\n",
    "    .appName('test') \\\n",
    "    .getOrCreate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "3f604529",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2022-02-15 22:23:22--  https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv\n",
      "Resolving s3.amazonaws.com (s3.amazonaws.com)... 54.231.196.8\n",
      "Connecting to s3.amazonaws.com (s3.amazonaws.com)|54.231.196.8|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 12322 (12K) [application/octet-stream]\n",
      "Saving to: ‘taxi+_zone_lookup.csv’\n",
      "\n",
      "taxi+_zone_lookup.c 100%[===================>]  12.03K  --.-KB/s    in 0s      \n",
      "\n",
      "2022-02-15 22:23:23 (114 MB/s) - ‘taxi+_zone_lookup.csv’ saved [12322/12322]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "!wget https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "12342345",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\"LocationID\",\"Borough\",\"Zone\",\"service_zone\"\r",
      "\r\n",
      "1,\"EWR\",\"Newark Airport\",\"EWR\"\r",
      "\r\n",
      "2,\"Queens\",\"Jamaica Bay\",\"Boro Zone\"\r",
      "\r\n",
      "3,\"Bronx\",\"Allerton/Pelham Gardens\",\"Boro Zone\"\r",
      "\r\n",
      "4,\"Manhattan\",\"Alphabet City\",\"Yellow Zone\"\r",
      "\r\n",
      "5,\"Staten Island\",\"Arden Heights\",\"Boro Zone\"\r",
      "\r\n",
      "6,\"Staten Island\",\"Arrochar/Fort Wadsworth\",\"Boro Zone\"\r",
      "\r\n",
      "7,\"Queens\",\"Astoria\",\"Boro Zone\"\r",
      "\r\n",
      "8,\"Queens\",\"Astoria Park\",\"Boro Zone\"\r",
      "\r\n",
      "9,\"Queens\",\"Auburndale\",\"Boro Zone\"\r",
      "\r\n"
     ]
    }
   ],
   "source": [
    "!head taxi+_zone_lookup.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "809464d0",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = spark.read \\\n",
    "    .option(\"header\", \"true\") \\\n",
    "    .csv('taxi+_zone_lookup.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "e36dd996",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----------+-------------+--------------------+------------+\n",
      "|LocationID|      Borough|                Zone|service_zone|\n",
      "+----------+-------------+--------------------+------------+\n",
      "|         1|          EWR|      Newark Airport|         EWR|\n",
      "|         2|       Queens|         Jamaica Bay|   Boro Zone|\n",
      "|         3|        Bronx|Allerton/Pelham G...|   Boro Zone|\n",
      "|         4|    Manhattan|       Alphabet City| Yellow Zone|\n",
      "|         5|Staten Island|       Arden Heights|   Boro Zone|\n",
      "|         6|Staten Island|Arrochar/Fort Wad...|   Boro Zone|\n",
      "|         7|       Queens|             Astoria|   Boro Zone|\n",
      "|         8|       Queens|        Astoria Park|   Boro Zone|\n",
      "|         9|       Queens|          Auburndale|   Boro Zone|\n",
      "|        10|       Queens|        Baisley Park|   Boro Zone|\n",
      "|        11|     Brooklyn|          Bath Beach|   Boro Zone|\n",
      "|        12|    Manhattan|        Battery Park| Yellow Zone|\n",
      "|        13|    Manhattan|   Battery Park City| Yellow Zone|\n",
      "|        14|     Brooklyn|           Bay Ridge|   Boro Zone|\n",
      "|        15|       Queens|Bay Terrace/Fort ...|   Boro Zone|\n",
      "|        16|       Queens|             Bayside|   Boro Zone|\n",
      "|        17|     Brooklyn|             Bedford|   Boro Zone|\n",
      "|        18|        Bronx|        Bedford Park|   Boro Zone|\n",
      "|        19|       Queens|           Bellerose|   Boro Zone|\n",
      "|        20|        Bronx|             Belmont|   Boro Zone|\n",
      "+----------+-------------+--------------------+------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "cb547351",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "[Stage 4:>                                                          (0 + 1) / 1]\r",
      "\r",
      "                                                                                \r"
     ]
    }
   ],
   "source": [
    "df.write.parquet('zones')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "02fe2bdb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "total 28K\r\n",
      "-rw-rw-r-- 1 alexey alexey 6.8K Feb 15 22:25 Untitled.ipynb\r\n",
      "-rw-rw-r-- 1 alexey alexey  13K Aug 17  2016 taxi+_zone_lookup.csv\r\n",
      "drwxr-xr-x 2 alexey alexey 4.0K Feb 15 22:25 zones\r\n"
     ]
    }
   ],
   "source": [
    "!ls -lh"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "659f0812",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
